library(data.table)
library(DT)
library(ggplot2)
library(gridExtra)
library(leaflet)
library(flexdashboard)
library(shiny)
library(rmarkdown)
library(knitr)
library(Hmisc)
library(DT)
library(tidyverse)
library(scales)
library(base)
library(ggrepel)
library(mapdata)
library(RColorBrewer)
library(lattice)
library(magrittr)
library(plotly)
library(leaflet, quietly = TRUE)
library(maps, quietly = TRUE)
library(tidytext)
library(wordcloud)
library(stringr)
library(reshape2)
library(ROCR)
library(tree)
library(randomForest)
library(maptree)
library(class)
library(gbm)
library(e1071)
# library(imager)
library(rpart)
library(caret)
library(purrr)
library(adabag)It is often said that making ourselves “Visible” online is the key to success. From day one at school, we were told to create a LinkedIn account with a “Good Looking” profile picture. This applies to most businesses as well. Companies pay billions of dollars just to make customers “Aware” of their products or services. However, for a local business it is difficult to do so, given a limited budget among other reasons. Therefore, it is a great niche market. There are many platforms out there that can help local business owners promote their products or services, such as Yelp, Google Maps, and TripAdvisor — and Yelp is the most famous one. (We chose Yelp over the others not because we like Yelp more, but because the Yelp dataset was easy to retrieve.)
Yet, one question remains unanswered: Is putting information on Yelp helpful for a local business to grow? If so, how could it help a local business sustain itself? According to the actual ratings that users give the Yelp business app, the answer may be otherwise. Looking at these two different stories, we decided to investigate whether Yelp is beneficial to a local business’s “sustainability”.
To work in a format that we are familiar with, we had to convert the JSON into CSV. Since the files are too big, we had to take an extra route: we first saved them as RDS files before we finally wrote them to CSV files.
library(jsonlite)
# One-time JSON-to-CSV conversion, kept commented out for reference.
## Get business from JSON (need reconstruction) biz <-
## stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/business.json'))
## biz2<-data.frame(cbind(biz[,1:11],biz$attributes,biz[,13],biz$hours))
## #reconstruct data into data.frame
## colnames(biz2)[51]<-colnames(biz)[13]
# Data cleaning of business: Part 1: Rename variables
# setnames(x=biz2,old =
# names(biz2)[12:50],new=sprintf('Attributes_%s',names(biz2)[12:50]))
# setnames(x=biz2,old =
# names(biz2)[52:58],new=sprintf('Hours_%s',names(biz2)[52:58]))
# names(biz2)
## Get other 5 dataset from JSON full_review <-
## stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/review.json')) user<-
## stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/user.json'))
## tip<-stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/tip.json'))
## checkin<-stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/checkin.json'))
## photo<-stream_in(file('~/Desktop/Apply DS/Final
## Project/Data/yelp_dataset/photo.json'))
## save all 6 datasets in csv
# write.csv(biz2,file='business.csv')
# saveRDS(full_review,'full_review.rds')
# write.csv(user,file='user.csv',row.names=FALSE)
# write.csv(tip,file='tip.csv',row.names=FALSE)
# NOTE(review): the original had file='heckin.csv' -- typo for 'checkin.csv'.
# write.csv( checkin,file='checkin.csv',row.names=FALSE)
# write.csv( photo,file='photo.csv',row.names=FALSE)# Business EDA
# Load the pre-converted CSV extracts (see the JSON-to-CSV notes above).
business <- fread(input = "../Data/business.csv", verbose = FALSE)
# business.df <- read.csv('../Data/business.csv')
user <- fread(input = "../Data/user.csv", verbose = FALSE)
# City choices offered in the dashboard filter ("All" plus the two study cities).
restaurant <- fread(input = "../Data/restaurant.csv", verbose = FALSE)cities.name <- c("All", "Phoenix", "Las Vegas")
# Column-name constants for the business table, used with get() so the
# analysis code never hard-codes column names inline.
business.id.name = "business_id"
business.name.name = "name"
address.name = "address"
city.name = "city"
state.name = "state"
zipcode.name = "postal_code"
latitude.name = "latitude"
longitude.name = "longitude"
stars.name = "stars"
# NOTE(review): "coutn" is a typo for "count", but this name is referenced
# throughout the file, so it is left unchanged.
review.coutn.name = "review_count"
category.name = "categories"
is.open.name = "is_open"
cuisine.name = "cuisine.info"
# Split business into attribute, hour, and core-variable subsets
# (column positions match the business.csv layout produced above).
attributes = business[, c(2, 13:51)]
hour = business[, c(2, 53:59)]
new.business = business[, c(2:12, 52)]
# Column-name constants for the user table.
user.id.name = "user_id"
elite.name = "elite"
friends.name = "friends"
average.stars.name = "average_stars"
helpful.name = "helpful"
fans.name = "fans"
num.of.elites.name = "num_of_elites"
num.of.friends.name = "num_of_friends"
final.score.name = "final_score"
influencer.name = "influencer"
# Key user attributes that feed the influencer score below.
user.key.cols.names <- c("review_count", "useful", "fans",
"num_of_elites", "num_of_friends")
# All attribute columns share the "Attributes_" prefix set during cleaning.
pattern.attributes <- "Attributes_"
attributes.list <- names(business)[grep(pattern = pattern.attributes,
x = names(business))]
# Unique-value inventories used for the summary counts below.
# NOTE(review): "zipcod" is a typo for "zipcode"; left as-is.
unique.id <- business[, unique(get(business.id.name))]
unique.name <- business[, unique(get(business.name.name))]
unique.address <- business[, unique(get(address.name))]
unique.city <- business[, unique(get(city.name))]
unique.state <- business[, unique(get(state.name))]
unique.zipcod <- business[, unique(get(zipcode.name))]
unique.cuisine <- restaurant[, unique(get(cuisine.name))]
unique.restaurant <- restaurant[, unique(get(business.id.name))]
num.business <- length(unique.id)
num.restaurant <- length(unique.restaurant)
num.cuisine <- length(unique.cuisine)
# Variables examined as predictors vs. the is_open outcome.
respondent.variables <- c(city.name, state.name, cuisine.name,
review.coutn.name, stars.name)
dependent.variables <- c(is.open.name)
# Review EDA (part3 data)
# Sort keys offered in the sentiment-score tables.
sorted.variables = c("total.sentiment.score", "total.num.post",
"negative", "positive", "positive.sentiment.ratio")
# NOTE(review): full.restaurant.w.cuisine2, bing_df2 and new.bing_df2 are
# built in a later section of the report; this chunk assumes they exist.
top10_afin_result <- full.restaurant.w.cuisine2 %>% arrange(desc(total.sentiment.score))
bing_df2 <- data.table(bing_df2)
# Names of the ten restaurants with the highest total sentiment score.
top10.name = as.vector(unlist(unique(bing_df2[business_id %in%
top10_afin_result$business_id[1:10], "name"])))
type_wordcloud = c("Positive", "Negative", "Both")
## Review EDA (part4 data)
# Year choices run newest-to-oldest; month choices run in natural order.
top10.year = c(max(new.bing_df2$year):min(new.bing_df2$year))
top10.month = c(min(new.bing_df2$month):max(new.bing_df2$month))
# Cuisine groupings used to collapse Yelp categories into broader labels.
cuisine_american <- c("American", "American (New)", "Breakfast & Brunch",
"Burgers", "Fast Food", "American (Traditional)", "Breakfast & Brunch",
"Burgers", "Barbeque", "Cheesesteaks", "Cajun/Creole",
"Steakhouses", "Sandwiches", "Pizza", "Hot Dogs", "Chicken Wings",
"Hawaiian", "Delis", "Soul Food", "Diners", "Bagels")
cuisine_bars <- c("Bars", "Beer Bar", "Breweries", "Dive Bars",
"Cocktail Bars", "Music Venues", "Nightlife", "Sports Bars",
"Wine Bars", "Pubs", "Gastropubs", "Lounges", "Karaoke",
"Dance Clubs")
cuisine_cafe <- c("Cafes", "Bubble Tea", "Bakeries", "Breakfast & Brunch",
"Bagels", "Juice Bars & Smoothies", "Coffee & Tea",
"Ice Cream & Frozen Yogurt")
cuisine_asian <- c("Asian", "Asian Fusion")
cuisine_Vietnamese <- c("Vietnamese", "Pho")
cuisine_vege <- c("Vege", "Gluten-Free", "Vegetarian", "Vegan")
cuisine_south_asia <- c("South_Asia", "Bangladeshi", "Indian",
"Pakistani", "Sri Lankan")
# cuisine_east_asia <- c('East_Asia', 'Chinese',
# 'Japanese','Korean')
cuisine_se_asia <- c("South_East_Asia", "Filipino", "Indonesian",
"Laotian", "Malaysian", "Singaporean", "Vietnamese")
cuisine_europe <- c("European", "French", "Greek", "Italian",
"Portuguese", "Polish", "Russian", "Ukrainian")
# Master search list consumed by fill_in_NA() (first element of each
# sub-vector is the umbrella label).
cuisine_ls <- c(cuisine_american, cuisine_bars, cuisine_cafe,
cuisine_asian, cuisine_Vietnamese, cuisine_vege)
# levels(as.factor(DT$cuisine.info))
# Training sample size and the set of classifiers compared later.
N <- 10000
ls.models <- c("Classification Tree", "Random Forest", "Logistic Regression",
"Support Vector Machine", "KNN", "Naive Bayes")
# Running counters for figure and table numbering in the report.
count.fig = 1
count.table = 1
# round.numerics(): round a numeric vector to `digits` decimal places;
# non-numeric input is passed through untouched (handy with lapply over
# mixed-type data.table columns).
round.numerics <- function(x, digits) {
    if (!is.numeric(x)) {
        return(x)
    }
    round(x = x, digits = digits)
}
# count_num(): number of comma-separated entries in the string x (used
# for the `elite` years and `friends` lists in the user data).
count_num <- function(x) {
    pieces <- strsplit(x, split = ",", fixed = TRUE)
    length(unlist(pieces))
}
# scaling_user(): min-max scale a numeric vector to [0, 1], ignoring NA
# when computing the range (NA entries stay NA in the output).
scaling_user <- function(x) {
    lo <- min(x, na.rm = TRUE)
    hi <- max(x, na.rm = TRUE)
    (x - lo) / (hi - lo)
}
# fill_in_NA(): tag every row of the global data.table DT whose
# `categories` string matches any term in cuisine_ls with the list's
# first element (the umbrella cuisine label).
# NOTE(review): this version overwrites cuisine.info unconditionally; it
# is shadowed by the later redefinition of the same name (under
# "### Business") that only fills rows still NA. Dead code as written.
fill_in_NA <- function(cuisine_ls) {
for (i in cuisine_ls) {
# Temporary indicator column: does this row's categories mention term i?
DT[, `:=`("idx", str_detect(DT$categories, i))]
DT[get("idx") == 1, `:=`(eval(cuisine.name), cuisine_ls[1])]
# DT[get('idx') == 1 & is.na(get(cuisine.name))==TRUE,
# eval(cuisine.name) := cuisine_ls[1]]
DT[, `:=`("idx", NULL)]
}
}
# erate(): misclassification rate -- the fraction of predictions that
# disagree with the true labels.
erate <- function(predicted.value, true.value) {
    mean(predicted.value != true.value)
}
# create.formula(): build a modelling formula string "`y` ~ `a` + `b`".
#
# outcome.name    -- name of the outcome variable (left-hand side).
# input.names     -- explicit input variable names.
# input.patterns  -- optional regex patterns; any name in all.data.names
#                    matching one of them is added as an input.
# all.data.names  -- optional universe of valid names; inputs outside it
#                    are dropped (also required for pattern matching).
# return.as       -- "character" (default) or "formula".
#
# The outcome is always excluded from the right-hand side, and every
# name is backtick-quoted so non-syntactic names survive as.formula().
create.formula <- function(outcome.name, input.names, input.patterns = NA,
    all.data.names = NA, return.as = "character") {
    from.patterns <- c()
    if (!is.na(input.patterns[1]) & !is.na(all.data.names[1])) {
        combined.pattern <- paste(input.patterns, collapse = "|")
        from.patterns <- all.data.names[grepl(pattern = combined.pattern,
            x = all.data.names)]
    }
    # Deduplicate and drop the outcome from the candidate inputs.
    inputs <- setdiff(unique(c(input.names, from.patterns)), outcome.name)
    if (!is.na(all.data.names[1])) {
        inputs <- inputs[inputs %in% all.data.names]
    }
    rhs <- paste(sprintf("`%s`", inputs), collapse = " + ")
    the.formula <- sprintf("`%s` ~ %s", outcome.name, rhs)
    if (return.as == "formula") {
        return(as.formula(the.formula))
    }
    the.formula
}
# reduce.formula(): drop right-hand-side variables that are unusable in
# a model fit -- not present in dat, constant (fewer than 2 unique
# values), or categorical with more than max.categories levels.
#
# dat                 -- data.frame/data.table holding the variables.
# the.initial.formula -- formula as a character string ("y ~ a + b").
# max.categories      -- optional cap on levels for character/factor
#                        inputs; NA disables the check.
# Returns the reduced formula as a character string, or an error string
# when the outcome is missing from dat.
#
# Fixes relative to the original version:
#  * delimiter stripping handles backticks as well as single quotes, so
#    formulas produced by create.formula() (which backtick-quotes each
#    name) are parsed correctly;
#  * the rebuilt right-hand side backtick-quotes names instead of
#    single-quoting them (quoted names inside a formula parse as string
#    literals, not variables, which broke as.formula() on the output);
#  * seq_len() guards the loop against an empty right-hand side.
reduce.formula <- function(dat, the.initial.formula, max.categories = NA) {
    require(data.table)
    dat <- setDT(dat)
    the.sides <- strsplit(x = the.initial.formula, split = "~")[[1]]
    lhs <- trimws(x = the.sides[1], which = "both")
    # Strip backtick or quote delimiters to recover the bare name.
    lhs.original <- gsub(pattern = "[`']", replacement = "", x = lhs)
    if (!(lhs.original %in% names(dat))) {
        return("Error: Outcome variable is not in names(dat).")
    }
    the.pieces.untrimmed <- strsplit(x = the.sides[2], split = "+",
        fixed = TRUE)[[1]]
    the.pieces.untrimmed.2 <- gsub(pattern = "[`']", replacement = "",
        x = the.pieces.untrimmed)
    the.piece.in.names <- trimws(x = the.pieces.untrimmed.2,
        which = "both")
    # Keep only variables that actually exist in dat.
    the.pieces <- the.piece.in.names[the.piece.in.names %in%
        names(dat)]
    num.variables <- length(the.pieces)
    include.pieces <- logical(num.variables)
    for (i in seq_len(num.variables)) {
        unique.values <- dat[, unique(get(the.pieces[i]))]
        num.unique.values <- length(unique.values)
        # A variable needs at least two distinct values to be useful.
        include.pieces[i] <- num.unique.values >= 2
        if (!is.na(max.categories)) {
            # Exclude high-cardinality character/factor variables.
            if (dat[, is.character(get(the.pieces[i])) |
                is.factor(get(the.pieces[i]))] == TRUE) {
                if (num.unique.values > max.categories) {
                    include.pieces[i] <- FALSE
                }
            }
        }
    }
    pieces.rhs <- sprintf("`%s`", the.pieces[include.pieces ==
        TRUE])
    rhs <- paste(pieces.rhs, collapse = "+")
    the.formula <- sprintf("%s ~%s", lhs, rhs)
    return(the.formula)
}
# eval.dat(): look up an object by its name given as a string -- a thin
# indirection used to fetch datasets chosen from UI inputs.
eval.dat <- function(x) {
    eval(as.symbol(x))
}
### Business
# fill_in_NA(): for each search term in cuisine_ls, label matching rows
# of the global data.table DT with the umbrella cuisine (the list's
# first element), but only where cuisine.info is still NA. Replaces the
# earlier definition of the same name, which overwrote unconditionally.
fill_in_NA <- function(cuisine_ls) {
cat = cuisine_ls[1]
# print(cat)
for (i in cuisine_ls) {
# Temporary indicator: does this row's categories mention term i?
DT[, `:=`("idx", str_detect(DT$categories, i))]
# DT[get('idx') == 1, eval(cuisine.name) :=
# cuisine_ls[1]]
DT[get("idx") == 1 & is.na(get(cuisine.name)) ==
TRUE, `:=`(eval(cuisine.name), cat)]
DT[, `:=`("idx", NULL)]
}
}
# combine_regions(): collapse individual cuisines into a region label
# (the first element of region_ls) by overwriting cuisine.info on every
# row of the global DT whose current cuisine matches a member term.
combine_regions <- function(region_ls) {
cat = region_ls[1]
for (i in region_ls) {
# Temporary indicator: does this row's cuisine match term i?
DT[, `:=`("idx", str_detect(DT$cuisine.info, i))]
DT[get("idx") == 1, `:=`(eval(cuisine.name), cat)]
DT[, `:=`("idx", NULL)]
}
}
# fill_results(): write a fitted model's outputs into the global result
# holders -- result[[1]] becomes the model's row in `record`, and
# result[[2]] becomes its column in the `auc` table.
# NOTE(review): relies on <<- global assignment; acceptable in a report
# script, though passing and returning would be cleaner for reuse.
fill_results <- function(result, model_name) {
record[model_name, ] <<- result[[1]]
auc[, model_name] <<- result[[2]]
}The Yelp dataset is a subset of Yelp’s businesses, reviews, and user data for use in personal, educational, and academic purposes. The Yelp dataset is a subset of Yelp’s businesses, reviews, and user data for use in personal, educational, and academic purposes. The original goal of the Yelp competition was to predict the star (rate) of a local business given the information. The URL of the original dataset is followed : https://www.yelp.com/dataset/challenge.
The whole dataset consists of six JSON files:
business.json: Contains business data including location data, attributes, and categories.
review.json: Contains full review text data including the user_id that wrote the review and the business_id the review is written for.
user.json: User data including the user’s friend mapping and all the metadata associated with the user.
checkin.json: Checkins on a business.
tip.json: Tips are written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.
photo.json: Contains photo data including the caption and classification (one of “food”, “drink”, “menu”, “inside” or “outside”)
Among the 6 datasets, we only employed the business, review, and user datasets. Each one is moderately large to deal with. After working on feature engineering and manipulating the data to work smoothly, we ended up with 22 GB from those three alone.
The User dataset contains more than 1.6 million unique users, along with 23 variables for each user, including basic information like id and name, as well as attributes like number of reviews given, elite membership years, and average stars given. The last 11 variables are attributes from the compliment function on Yelp, where other users can send compliments on a particular post. Due to time constraints, we chose not to incorporate those attributes into our EDA.
# Add two new columns to store number of years being
# elite member and number of friends.
# (count_num counts comma-separated entries; grouping by the row-index
# column V1 forces per-row evaluation of the non-vectorized helper.)
user[, `:=`(eval(num.of.elites.name), count_num(elite)),
by = V1]
user[, `:=`(eval(num.of.friends.name), count_num(friends)),
by = V1]We started off by taking a deeper look at number of reviews per user. From the summary and density plot, we could see that the majority of users do not post many reviews, but a few users posted up to 13278 reviews. When we narrowed down to users with up to 75 posts, it is clear to conclude that most users posted less than 10 posts.
# User EDA: distributions of review counts, average ratings, elite
# years, and friend counts.
# Summary of number of reviews per user user[,
# summary(get(review.coutn.name))]
# Plot Number of Reviews per User
ggplot(user) + geom_density(aes(get(review.coutn.name)),
fill = "coral3", color = "coral3") + labs(title = "Number of Reviews per User") +
theme(plot.title = element_text(hjust = 0.5), legend.position = "none")# Plot users with less than 75 reviews
ggplot(subset(user, get(review.coutn.name) <= 75)) + geom_histogram(aes(review_count),
fill = "coral3", color = "white", binwidth = 4) + labs(title = "Number of Reviews per User (<=75)") +
theme(plot.title = element_text(hjust = 0.5), legend.position = "none")On the histogram of average ratings among all users, the top two scores that most users tend to give is 5 and 1, which intuitively makes sense, because people are more likely to share their comments when they have an extremely satisfying experience, or on the other hand, a horrible experience. Overall, the average rating among all users was 3.681.
# Summary of users' average ratings user[,
# summary(get(average.stars.name))]
# Plot users' average ratings
ggplot(user) + geom_histogram(aes(get(average.stars.name)),
fill = "coral3", color = "white", binwidth = 0.1) +
labs(title = "Average Ratings") + theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")We then continued our EDA by investigating the number of years for each user being an elite member, a measurement for a user’s level of activeness. The summary and histogram again presented an imbalanced result, where over 1.5 million users have never been an elite member, but the maximum value is 13 years.
# Summary of Number of Years Being A Elite Member user[,
# summary(get(num.of.elites.name))]
# Plot Number of Years Being A Elite Member
# (x-axis clipped to [-0.5, 6.5] so the long right tail does not hide
# the dominant zero-years bar)
ggplot(user) + geom_histogram(aes(get(num.of.elites.name)),
fill = "coral3", color = "white", binwidth = 0.5) +
labs(title = "Number of Years Being A Elite Member") +
xlim(-0.5, 6.5) + theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")Similarly, for number of friends per user, most users only have one friend, whereas the maximum value is 14995 friends.
# Summary of Number of Friends per User user[,
# summary(get(num.of.friends.name))]
# Plot users with less than 20 friends
ggplot(subset(user, get(num.of.friends.name) <= 20)) + geom_histogram(aes(get(num.of.friends.name)),
fill = "coral3", color = "white", binwidth = 1) + labs(title = "Number of Friends per User (<=20)") +
theme(plot.title = element_text(hjust = 0.5), legend.position = "none")A key message we have received so far is that among all the 1.6 million uses, a few of them are extremely active and influencial, so our final goal for EDA of user dataset is to identify those users. We first selected a few key variables to include in the data.table, which are review_count, useful, fans, num_of_elites and num_of_friends. After that we defined a formula to calculate a final score for each user (equal weights of 20% was assumed in our case, but the formula is subject to change). Finally, we identified influencers to be those who are one standard deviation away from the mean of final score, which accounted for 4% of total population of users.
# Create a new data.table to store key attributes
# (each key column is min-max scaled to [0, 1] via scaling_user).
user_key_attributes <- cbind(user[, get(user.id.name)],
user[, lapply(X = .SD, FUN = scaling_user), .SDcols = user.key.cols.names])
setnames(x = user_key_attributes, old = "V1", new = "user_id")
# Calculate final score for each user
# (equal 20% weight per scaled attribute; the weighting is a modelling
# choice and subject to change).
user_key_attributes[, `:=`(eval(final.score.name), 20 *
get(review.coutn.name) + 20 * useful + 20 * fans + 20 *
num_of_elites + 20 * num_of_friends)]
# Define influencers
# (flag = 1 when the final score is more than one sd above the mean).
user_key_attributes[, `:=`(eval(influencer.name), 1 * (get(final.score.name) >
mean(get(final.score.name)) + sd(get(final.score.name))))]
# Summary of influencers user_key_attributes[,
# summary(get(influencer.name))]
# Show the first 100 users with values rounded to 3 decimals.
datatable(data = user_key_attributes[1:100, lapply(X = .SD,
FUN = "round.numerics", digits = 3)], rownames = FALSE,
extensions = DT.extensions)# Splitting the business into 3 diff datasets : business,
# attribute, and hour
# NOTE(review): this duplicates the split already performed in the setup
# chunk near the top of the file; harmless, but one copy could go.
attributes = business[, c(2, 13:51)]
hour = business[, c(2, 53:59)]
new.business = business[, c(2:12, 52)]
# dim(new.business) names(new.business)Business data originally had 58 variables, including but not limited to:
business_id: 22 character unique string business idname: the business’s nameaddress: the full address of the businesscity: the citystates: 2 character state code, if applicablepostal_code: the postal codelatitude/longitude: latitude / longitudestars: star ratingreview_count: number of reviewsis_open: 0 or 1 for closed or open, respectivelyattributes: business attributes to values. note: some attribute values might be objects
hours: hours are using a 24hr clock
Both attributes and hour variable contains too many missing variables and in peculiar format, for instance, {‘dessert’: False, ‘latenight’: False, ‘lunch’: True, ‘dinner’: True, ‘brunch’: False, ‘breakfast’: False}. It was hard to find a way to fill in those missing values, we therefore decided to move on without these two variables. Yet, we still believe that these two could have an impact on our result. If we had more time, we would definitely work on these variables to incorporate them into our classification.
# Tracking top famous categories in Yelp business data
# (one row per category occurrence, then counted and sorted descending).
category.tab = data.table(Category = unlist(strsplit(new.business$categories,
",")))
category.tab = category.tab[, .N, by = Category]
setnames(x = category.tab, old = "N", new = "Count")
setorderv(x = category.tab, cols = "Count", order = -1)
datatable(category.tab, extensions = DT.extensions)# Visualize the Top 10 categories
# Horizontal bar chart of the ten most frequent categories, labelled
# with rounded counts in thousands.
top10_category_plot = ggplot(data = category.tab[1:10],
aes(x = reorder(Category, Count), y = Count, fill = Category)) +
geom_bar(stat = "identity", color = "black") + geom_text(aes(x = Category,
y = 1, label = paste0("(", round(Count/1000), " K )",
sep = "")), hjust = 0, vjust = 0.5, size = 3, colour = "black",
fontface = "bold") + labs(x = "Category", y = "Count",
title = "Top 10 Category in Yelp") + coord_flip() +
theme_bw()
print(top10_category_plot)states <- data.table(map_data("state"))
# State-level summaries: business/restaurant counts, mean stars, and a
# mean coordinate used to position each state's label on the map.
business_sum <- business[, .(num = .N, mean_stars = mean(get(stars.name)),
long = mean(get(longitude.name)), lat = mean(get(latitude.name))),
keyby = state.name]
restaurant_sum <- restaurant[, .(num = .N, mean_stars = mean(get(stars.name)),
long = mean(get(longitude.name)), lat = mean(get(latitude.name))),
keyby = state.name]
# Focus regions/cities for the AZ-vs-NV comparison below.
two_states_reg <- subset(states, (region == "arizona") |
(region == "nevada"))
two_states_sum <- subset(restaurant_sum, (state == "AZ") |
(state == "NV"))
two_cites <- subset(restaurant, (city == "Phoenix") | (city ==
"Las Vegas"))
# Plot all businesses in the U.S.
ggplot() + geom_polygon(data = states, aes(x = long, y = lat,
group = group), color = "white", fill = "grey") + coord_fixed(1.3) +
guides(fill = FALSE) + geom_point(data = business_sum,
aes(x = long, y = lat, colour = mean_stars, size = num)) +
scale_size(name = "Number of Businesses") + geom_label_repel(data = business_sum,
aes(x = long, y = lat, label = state, fill = factor(state)),
size = 2) + labs(title = "All Businesses in the US and Canada") +
theme(plot.title = element_text(hjust = 0.5))Rest of the data was relatively clean: no missing data, no atypical format. From exploring the data, particularly the category variable, we found out that Yelp deals with more than restaurants. They have different types of business, including shopping, insurance, hospital, hotels, bars, party bus, sewing, real estate, restaurants, and other variety of industries. The top categories that Yelp work with are shown in the grpahs. Coincidentally, among the many categories, restaurants were enlisted the most and we, as foodies, decided to investigate only the restaurants.
# Same map as above, restricted to restaurants.
ggplot() + geom_polygon(data = states, aes(x = long, y = lat,
group = group), color = "white", fill = "grey") + coord_fixed(1.3) +
guides(fill = FALSE) + geom_point(data = restaurant_sum,
aes(x = long, y = lat, colour = mean_stars, size = num)) +
scale_size(name = "Number of Restaurants") + geom_label_repel(data = restaurant_sum,
aes(x = long, y = lat, label = state, fill = factor(state)),
size = 2) + labs(title = "All Restaurants in the US and Canada") +
theme(plot.title = element_text(hjust = 0.5))w = new.business[, grepl(pattern = "Restaurants", x = categories)]
restaurant = new.business[w, ]
# From now on we will work on restaurant dataset,
# instead of new.business
# Find what is the most famous cuisine among the ones I
# named in restaurant
# Reference list of world cuisines (see link below) matched against the
# Yelp `categories` strings to assign each restaurant a cuisine label.
# NOTE(review): a few entries repeat ("Chinese", "Indian"), which is
# harmless for %in% membership tests.
cuisine.list = c("Ainu", "Albanian", "Argentina", "Andhra",
"Anglo-Indian", "Arab", "Armenian", "Assyrian", "Awadhi",
"Azerbaijani", "Balochi", "Belarusian", "Bangladeshi",
"Bengali", "Berber", "Buddhist", "Bulgarian", "Cajun",
"Chechen", "Chinese", "Chinese Islamic", "Circassian",
"Crimean Tatar", "Cypriot", "Danish", "Estonian", "French",
"Filipino", "Georgian", "Goan", "Goan Catholic", "Greek",
"Gujarati", "Hyderabad", "Indian", "Indian Chinese",
"Singaporean", "Indonesian", "Inuit", "Italian American",
"Italian", "Japanese", "Jewish", "Karnataka", "Kazakh",
"Keralite", "Korean", "Kurdish", "Laotian", "Latvian",
"Lithuanian", "Louisiana Creole", "Maharashtrian", "Mangalorean",
"Malay", "Chinese", "Malaysian", "Indian", "Mediterranean",
"Mexican", "Mordovian", "Mughal", "Native American",
"Nepalese", "New Mexican", "Odia", "Parsi", "Pashtun",
"Polish", "Pennsylvania Dutch", "Pakistani", "Peranakan",
"Persian", "Peruvian", "Portuguese", "Punjabi", "Rajasthani",
"Romanian", "Russian", "Sami", "Serbian", "Sindhi",
"Slovak", "Slovenian", "Somali", "South Indian", "Sri Lankan",
"Taiwanese", "Tatar", "Thai", "Turkish", "Tamil", "Udupi",
"Ukrainian", "Yamal", "Zanzibari", "American")
# reference:
# https://en.wikipedia.org/wiki/List_of_cuisines
# Adding Cuisine.info into the restaurant table.
# Label each restaurant with a single cuisine: the first of its Yelp
# categories that appears in cuisine.list, or "others" when none match.
categories.list <- strsplit(restaurant$categories, ", ")
# vapply replaces the original grow-by-append loop (quadratic from
# repeated append()) with a preallocated O(n) pass and guarantees one
# character label per restaurant.
cuisine.info <- vapply(X = categories.list, FUN = function(cats) {
    matched <- cats[cats %in% cuisine.list]
    if (length(matched) == 0) {
        return("others")
    }
    # Several categories may match; keep the first, as before.
    matched[1]
}, FUN.VALUE = character(1))
# Attach the derived cuisine label, then collapse American sub-cuisines
# into a single "American" bucket.
restaurant.w.cuisine = cbind(restaurant, cuisine.info)
restaurant.w.cuisine[cuisine.info %in% c("American (New)",
"Breakfast & Brunch", "Burgers", "Fast Food", "American (Traditional)",
"Breakfast & Brunch", "Burgers", "Barbeque", "Steakhouses",
"Sandwiches", "Pizza", "Hot Dogs"), `:=`(cuisine.info,
"American")]
# NOTE(review): mapping "Soul Food" to "Korean" looks like a mistake
# (soul food is American cuisine; possibly a "Seoul" mix-up) -- confirm.
restaurant.w.cuisine[cuisine.info %in% c("Soul Food"), `:=`(cuisine.info,
"Korean")]
restaurant.w.cuisine[cuisine.info %in% c("Sushi Bars"),
`:=`(cuisine.info, "Japanese")]
# Count restaurants per cuisine, sorted descending.
cuisine.tab <- restaurant.w.cuisine[, .N, by = cuisine.info]
setnames(cuisine.tab, old = "cuisine.info", new = "Cuisine")
setnames(cuisine.tab, old = "N", new = "Count")
setorderv(x = cuisine.tab, cols = "Count", order = -1)
# Visualize the Top 10 cuisine
# (rows 2:11 -- presumably skipping the catch-all "others" bucket in
# row 1; verify against the sorted table)
top10_cuisine_plot = ggplot(data = cuisine.tab[2:11], aes(x = reorder(Cuisine,
Count), y = Count, fill = Cuisine)) + geom_bar(stat = "identity",
color = "black") + geom_text(aes(x = Cuisine, y = 1,
label = paste0("(", round(Count/1000), " K )", sep = "")),
hjust = -0.5, vjust = 0.5, size = 3, colour = "black",
fontface = "bold") + labs(x = "Cuisine", y = "Count",
title = "Top 10 Cuisine in Yelp") + coord_flip() + theme_bw()
print(top10_cuisine_plot)When we select a restaurant, the first thing that we do is choose cuisine: Chinese, American, Mexican, Japanese, Korean, and such. Hence, we narrow down the restaurants by cuisines once again. Majority of them were not categorized as we planned. Still, it gave us a good idea which cuisine is the most popular in the United States. Mexican restaurants account for the most in the dataset. This makes sense because the majority of the immigrants in the states are from South America, and they are relatively cheap considering its quality. We presume that if we have the Yelp dataset from another country, then it would be different, i.e., Korean or Japanese restaurants would be the most popular ones in China. Besides, we bump into many Mexican restaurants whether that is food truck or actual restaurants in our daily lives.
# Top-10 bar charts of restaurant counts by state and by city.
# business by state
state.tab = restaurant.w.cuisine[, .(Count = .N), by = state]
setorderv(state.tab, cols = "Count", order = -1)
state_plot = ggplot(data = state.tab[1:10], aes(x = reorder(state,
Count), y = Count, fill = state)) + geom_bar(stat = "identity",
colour = "white") + geom_text(aes(x = state, y = 1,
label = paste0("(", round(Count/1000), " K )", sep = "")),
hjust = 0, vjust = 0.5, size = 4, colour = "black",
fontface = "bold") + labs(x = "State", y = "Count",
title = "Top 10 States in Yelp") + coord_flip() + theme_bw()
# business by city
city.tab = restaurant.w.cuisine[, .(Count = .N), by = city]
setorderv(city.tab, cols = "Count", order = -1)
city_plot = ggplot(data = city.tab[1:10], aes(x = reorder(city,
Count), y = Count, fill = city)) + geom_bar(stat = "identity",
colour = "white") + geom_text(aes(x = city, y = 1, label = paste0("(",
round(Count/1000), " K )", sep = "")), hjust = 0, vjust = 0.5,
size = 4, colour = "black", fontface = "bold") + labs(x = "City",
y = "Count", title = "Top 10 Cities in Yelp") + coord_flip() +
theme_bw()
grid.arrange(state_plot, city_plot)We expanded our exploration of the business data to states and cities to gain a better understanding of the data. The reason behind this is that we wanted to craft a story for at least one or two restaurants, as our professor recommended. Interestingly, we discovered that Ontario, a province in Canada, has the largest number of restaurants, followed by Arizona and Nevada. Originally, we wanted to compare the surviving rate of the restaurants in Ontario with that in Arizona; however, when we first converted the data we only took the first 100 thousand rows of the review data, and we could not find a sufficient number of restaurant reviews there. So, we agreed to go with the two most popular US states, Arizona and Nevada. In terms of cities, we see that Toronto has the largest number of restaurants listed, while Las Vegas and Phoenix account for the second and third most. From here, one could claim that restaurant data are not concentrated in Phoenix but spread across many different cities in Arizona, while most of the restaurant data from Nevada come from Las Vegas.
# Number of distinct cities with listed restaurants in each focus state.
AZ_cities_numb = restaurant.w.cuisine[get(state.name) ==
"AZ", length(unique(get(city.name)))]
# print(AZ_cities_numb) # 57
NV_cities_numb = restaurant.w.cuisine[get(state.name) ==
"NV", length(unique(get(city.name)))]
# print(NV_cities_numb) # 23
# Total number of restaurants across AZ and NV combined.
AZ_NV_restaurant_numb = restaurant.w.cuisine[get(state.name) ==
"AZ" | get(state.name) == "NV", .N]
# print(AZ_NV_restaurant_numb) # 19248
# Narrowing down the scope} full.restaurant.w.cuisine2
# [get(state.name) == 'AZ',(count=.N),by=is_open]
# full.restaurant.w.cuisine2 [get(state.name) ==
# 'AZ',(count=.N),by=is_open]To confirm this assertion, we took a look at the number of cities listed in the dataset for both AZ and NV. And, the number of cities listed under AZ, 57, is greater than that under NV, 23. As we mentioned briefly, our goal was to narrow down our data analysis to one or two restaurants and craft a story out of it, so we decided to move along with only AZ and NV.
# Overall surviving rate
overall.surviving.rate = restaurant.w.cuisine[, .N, by = is.open.name]
# NOTE(review): [1, 2] takes the count in the first row of the .N table;
# this assumes the is_open == 1 group appears first -- verify, since .N
# tables follow appearance order, not a fixed order.
overall.surviving.rate = overall.surviving.rate[1, 2]/nrow(restaurant)
# Simply take a look at the number of restaurants are
# open and close
vegas.open.tab = restaurant.w.cuisine[get(city.name) ==
"Las Vegas", .N, by = is.open.name]
phoenix.open.tab = restaurant.w.cuisine[get(city.name) ==
"Phoenix", .N, by = is.open.name]
# Calculating survival rates for both
vegas.open.tab[, `:=`(Total, sum(N))]
vegas.open.tab[, `:=`(Rate, (N/Total) * 100)]
setorderv(vegas.open.tab, cols = "is_open", order = -1)
vegas.surviving.rate = vegas.open.tab[1, 4]
# NOTE(review): phoenix.open.tab is not sorted by is_open before row 1
# is taken, unlike vegas.open.tab above -- confirm the intended row.
phoenix.open.tab[, `:=`(Total, sum(N))]
phoenix.open.tab[, `:=`(Rate, (N/Total) * 100)]
phoenix.surviving.rate = phoenix.open.tab[1, 4]
# vegas.surviving.rate > phoenix.surviving.rate
# vegas.open.tab[1,4] > phoenix.open.tab[1,4]To calculate the sustainability of restaurants, we consider Is_open variable as our outcome of regression. First, we calculated the overall surviving rates of the entire restaurant, and compare with that of AZ and NV. Overall surviving rate is 0.7114079 when that of Vegas is 65.4418605 and that of Phoenix is 67.991998.
In general, Vegas has more restaurants, including both those still in business and those no longer in business. Yet, the surviving rate in Vegas is lower than in Phoenix. In other words, the restaurant business is more competitive in Vegas than in Phoenix.
# Per-cuisine restaurant counts and surviving rates (mean of is_open)
# for each focus city, sorted by restaurant count.
vegas.surviving = restaurant.w.cuisine[get(city.name) ==
"Las Vegas", .(`# of restaurants` = .N, surviving.rate = mean(is_open,
na.rm = TRUE)), by = cuisine.info]
setorderv(vegas.surviving, "# of restaurants", -1)
# datatable(vegas.surviving, caption = 'Las Vegas
# Surviving Rate')
phoenix.surviving = restaurant.w.cuisine[get(city.name) ==
"Phoenix", .(`# of restaurants` = .N, surviving.rate = mean(is_open,
na.rm = TRUE)), by = cuisine.info]
setorderv(phoenix.surviving, "# of restaurants", -1)
# datatable(phoenix.surviving, caption = 'Phoenix
# Surviving Rate')
# Combine Two
survival_rate_tab <- rbind(vegas.surviving %>% mutate(Area = "Las Vegas"),
phoenix.surviving %>% mutate(Area = "Phoenix")) %>%
as.data.table
# setnames(survival_rate_tab,colnames(survival_rate_tab),
# c('Cuisine Info', 'Num of Restaurants', 'Surviving
# Rate', 'Area'))
# NOTE(review): caption "Surving Rate" is a typo for "Surviving Rate"
# (left unchanged here because it is displayed output text).
datatable(survival_rate_tab, colnames = c("Cuisine Info",
"Num of Restaurants", "Surviving Rate", "Area"), caption = "Surving Rate",
extensions = DT.extensions)# knitr::kable(survival_rate_tab, caption = 'Surving
# Rate') %>% kableExtra::kable_styling('striped',
# full_width = T) %>% kableExtra::pack_rows('Las Vegas',
# 1, nrow(vegas.surviving)) %>%
# kableExtra::pack_rows('Phoenix',nrow(vegas.surviving)+1
# , nrow(survival_rate_tab))We did a similar analysis by cuisine. Except for Mexican restaurants, all other cuisines have a lower surviving rate than the overall surviving rate. For the same reason (it is a US dataset), we could understand the situation.
# Count restaurants in each (is_open, stars) cell; `is.open.name` is a
# column-name variable defined earlier in the file.
star.dist.by.open <- restaurant.w.cuisine[, .N, by = c(is.open.name,
    "stars")]
# Dodged bar chart comparing the star-rating distribution of open vs
# closed restaurants. Fix: the bars are mapped via `fill`, so the Brewer
# palette must be applied with scale_fill_brewer(); the original
# scale_color_brewer() call silently had no effect on the bars.
star.dist.by.open_plot <- ggplot(star.dist.by.open, aes(x = stars,
    y = N, fill = factor(is_open))) + geom_bar(stat = "identity",
    position = "dodge") + scale_fill_brewer(palette = "Spectral") +
    xlab("Stars") + ylab("Count") + ggtitle("Star Distribution Between Open and Close") +
    theme(plot.title = element_text(size = 9.5))
print(star.dist.by.open_plot)Lastly, we compared the star ratings of restaurants that are still open with those that have closed. As one can see, there is not much difference between the two distributions except for the number of ratings.
Reviews are valuable for any store, especially for restaurants that serve thousands of customers every day. For Yelp users, star ratings may be the most straightforward way to judge whether a restaurant is good or bad; however, users gain little information by looking at the stars alone, especially when a restaurant has only a few ratings. This is because a star rating tends to lose much of the information provided by users — in other words, it is hard to extract specific, useful information by referencing only the stars that customers give a restaurant. Therefore, in the following section, we look at how Yelp reviews can help a business, by applying text-mining techniques with sentiment analysis. Text mining, or text analytics, is a useful tool for deriving information from text, and sentiment analysis is one of the most popular applications of text mining, widely applied to customer reviews such as those on Yelp. To uncover opinions from customers' reviews, we performed feature engineering by constructing a new feature, "sentiment score", to indicate the sentiment of customers' reviews for each business; it also plays a main role as a predictor when fitting the prediction model later on.
# full_review<-readRDS('~/Desktop/Apply DS/Final
# Project/Data/full_review.rds')
## Prepare 'data_clean': only get reviews for restaurant
## in 'restaurant.w.cuisine' (narrow down)
## cuisine.review<-merge(restaurant.w.cuisine,full_review,by='business_id')
## colnames(cuisine.review)[9]<-'business.star.rate'
## colnames(cuisine.review)[16]<-'Customer.review.rate'
## length(unique(cuisine.review$business_id)) ## 59371
## unique business
## length(unique(cuisine.review$review_id)) ## 4201684
## unique post for all 59371
## clean up full review from above
## target_tidy<-tibble(text=cuisine.review$text,business_id=factor(cuisine.review$business_id),post.id=factor(cuisine.review$review_id))
## data_clean<-target_tidy %>% unnest_tokens(word,text)
## %>% group_by(word) %>% anti_join(stop_words)
## write.csv(data_clean,'data_clean.csv')
## Get data_clean (full review and clean up by stop word)
# Load data
# Load the pre-tokenized review data (one row per word per review per
# business), produced by the commented-out pipeline above and cached as
# an RDS file.
# NOTE(review): refhook = FALSE is unusual -- readRDS() documents NULL or
# a function for refhook; confirm this file was saved without references.
data_clean <- readRDS("../Data/data_clean.rds", refhook = FALSE)
# data_clean<-data_clean[,2:4]
# data_clean<-data_clean[2:nrow(data_clean),]
# colnames(data_clean)<-c('business_id','review_id','word')For this project, we mainly focus on unigram-based sentiment analysis; in other words, we need to convert the text data into tidy format and break the text down into single words per review for each business. To ensure a high-quality result for the later sentiment analysis, we cleaned up our data by removing redundant tokens such as numbers, stop words, punctuation, whitespace, etc. Since we work with unigrams, we chose two different sentiment lexicons, "AFINN" and "bing", for detecting the emotion of each word. The "AFINN" lexicon assigns each word a score ranging from -5 (negative) to 5 (positive). The "bing" lexicon categorizes words into either "positive" or "negative" groups. We can then build our new "sentiment score" feature using different methods; here we use simple, straightforward calculations. For "AFINN", we simply sum the scores within each review, since one review may contain both positive and negative words. For "bing", we counted the numbers of positive and negative words in each review so that we could obtain the ratio of positive words per review. There are many other mathematical ways to obtain a representative sentiment score for each review, but here we tried the most intuitive ones.
Since the data is relatively large, we wrote a separate file containing all restaurants with reviews and summary statistics of their sentiment scores.
## Load full.restaurant.w.cuisine
## full.restaurant.w.cuisine<-readRDS('~/Desktop/Apply
## DS/Final Project/full.restaurant.w.cuisine.rds')
## saveRDS(full.restaurant.w.cuisine,'full.restaurant.w.cuisine.rds')
# full.restaurant.w.cuisine2<-readRDS('~/Desktop/Apply
# DS/Final Project/Data/full.restaurant.w.cuisine2.rds')## Following is procedure to get
## 'full.restaurant.w.cuisine': dataset with 'AFINN'
## total sentiment score
# Step 1: tag every tokenized word with its AFINN sentiment score; the
# inner join keeps only words that appear in the lexicon.
data_clean_sentiment <- data_clean %>%
    inner_join(get_sentiments("afinn")) %>%
    data.table()
colnames(data_clean_sentiment)[1] <- "business_id"
# Step 2: one total sentiment score per review, per restaurant.
aa <- data_clean_sentiment[,
    .(total.score.each.post = sum(score)),
    by = .(business_id, review_id)]
# Step 3: roll the per-review totals up to one row per restaurant,
# also keeping the number of reviews.
step3 <- aa[,
    .(total.sentiment.score = sum(total.score.each.post),
      total.num.post = .N),
    by = .(business_id)]
# Step 4: attach the restaurant-level AFINN summary to the
# cuisine-annotated restaurant table.
full.restaurant.w.cuisine <- merge(restaurant.w.cuisine, step3,
    by = "business_id")
# datatable(head(data.table(full.restaurant.w.cuisine)))
## Following is procedure to get
## 'full.restaurant.w.cuisine2': combine result above
## with 'bing' sentiment score
# Tag each word with its 'bing' polarity (positive / negative); the
# inner join drops words absent from the lexicon.
bing_df <- data_clean %>%
    inner_join(get_sentiments("bing")) %>%
    data.table()
# Count positive and negative words per restaurant across all reviews.
bb <- bing_df[, .(count = .N), by = .(business_id, sentiment)]
# One row per restaurant: positive / negative counts plus the share of
# positive words, as a percentage rounded to 2 decimal places.
bin_way2 <- bb %>%
    spread(sentiment, count, fill = 0) %>%
    mutate(positive.sentiment.ratio = round(100 * positive/(positive +
        negative), 2)) %>%
    arrange(desc(positive))
# Attach the 'bing' summary to the AFINN-augmented restaurant table.
full.restaurant.w.cuisine2 <- merge(full.restaurant.w.cuisine, bin_way2,
    by = "business_id")
# saveRDS(full.restaurant.w.cuisine2,'full.restaurant.w.cuisine2.rds')